# Data handling
import pandas as pd
import numpy as np
import datetime
from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
# Visualisations
!pip install chart_studio
import seaborn as sns
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from scipy import stats
from wordcloud import WordCloud
from wordcloud import STOPWORDS
# Machine learning
import xgboost as xgb
from sklearn import preprocessing
from xgboost import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from lightgbm import LGBMRegressor
Requirement already satisfied: chart_studio in c:\users\krish\anaconda3\lib\site-packages (1.1.0) Requirement already satisfied: plotly in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (5.9.0) Requirement already satisfied: retrying>=1.3.3 in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (1.3.3) Requirement already satisfied: requests in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (2.28.1) Requirement already satisfied: six in c:\users\krish\anaconda3\lib\site-packages (from chart_studio) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\krish\anaconda3\lib\site-packages (from plotly->chart_studio) (8.0.1) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (3.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (2022.9.24) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\krish\anaconda3\lib\site-packages (from requests->chart_studio) (1.26.11)
training_data = pd.read_csv("sales_train.csv",parse_dates = ['date'], infer_datetime_format = True, dayfirst = True)
test = pd.read_csv("test.csv")
items_data = pd.read_csv("items.csv")
shops_data = pd.read_csv("shops.csv")
item_categories = pd.read_csv("item_categories.csv")
sample_submission = pd.read_csv("sample_submission.csv")
# Showcase the initial 5 rows of the training_data dataset which needs to be modelled for future prediction
training_data.head()
| date | date_block_num | shop_id | item_id | item_price | item_cnt_day | |
|---|---|---|---|---|---|---|
| 0 | 2013-01-02 | 0 | 59 | 22154 | 999.00 | 1.0 |
| 1 | 2013-01-03 | 0 | 25 | 2552 | 899.00 | 1.0 |
| 2 | 2013-01-05 | 0 | 25 | 2552 | 899.00 | -1.0 |
| 3 | 2013-01-06 | 0 | 25 | 2554 | 1709.05 | 1.0 |
| 4 | 2013-01-15 | 0 | 25 | 2555 | 1099.00 | 1.0 |
# Showcase the initial 5 rows of the test dataset
# It contains item_id, shop_id
# Need to predict item_cnt_day
test.head()
| ID | shop_id | item_id | |
|---|---|---|---|
| 0 | 0 | 5 | 5037 |
| 1 | 1 | 5 | 5320 |
| 2 | 2 | 5 | 5233 |
| 3 | 3 | 5 | 5232 |
| 4 | 4 | 5 | 5268 |
# Showcase the initial 5 rows of the items dataset
# It contains item_name, item_id, item_category_id
items_data.head()
| item_name | item_id | item_category_id | |
|---|---|---|---|
| 0 | ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D | 0 | 40 |
| 1 | !ABBYY FineReader 12 Professional Edition Full... | 1 | 76 |
| 2 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 |
| 3 | ***ГОЛУБАЯ ВОЛНА (Univ) D | 3 | 40 |
| 4 | ***КОРОБКА (СТЕКЛО) D | 4 | 40 |
# Showcase the initial 5 rows of the shops dataset
# It contains shop_name, shop_id
shops_data.head()
| shop_name | shop_id | |
|---|---|---|
| 0 | !Якутск Орджоникидзе, 56 фран | 0 |
| 1 | !Якутск ТЦ "Центральный" фран | 1 |
| 2 | Адыгея ТЦ "Мега" | 2 |
| 3 | Балашиха ТРК "Октябрь-Киномир" | 3 |
| 4 | Волжский ТЦ "Волга Молл" | 4 |
# Showcase the initial 5 rows of the items_categories dataset
# It contains item_category_name and id
item_categories.head()
| item_category_name | item_category_id | |
|---|---|---|
| 0 | PC - Гарнитуры/Наушники | 0 |
| 1 | Аксессуары - PS2 | 1 |
| 2 | Аксессуары - PS3 | 2 |
| 3 | Аксессуары - PS4 | 3 |
| 4 | Аксессуары - PSP | 4 |
# Showcase the initial 5 rows of sample submission dataset
sample_submission.head()
| ID | item_cnt_month | |
|---|---|---|
| 0 | 0 | 0.5 |
| 1 | 1 | 0.5 |
| 2 | 2 | 0.5 |
| 3 | 3 | 0.5 |
| 4 | 4 | 0.5 |
#Size of all the datasets
print("Size of Training Dataset: ", training_data.shape)
print("Size of Testing Dataset: ", test.shape)
print("Size of Items Dataset: ", items_data.shape)
print("Size of Shops Dataset: ", shops_data.shape)
print("Size of Item Categories Dataset, ", item_categories.shape)
print("Size of Sample Submssion Dataset: ", sample_submission.shape)
Size of Training Dataset: (2935849, 6) Size of Testing Dataset: (214200, 3) Size of Items Dataset: (22170, 3) Size of Shops Dataset: (60, 2) Size of Item Categories Dataset, (84, 2) Size of Sample Submssion Dataset: (214200, 2)
# Merging all the files with common parameters as shop_id, item_id, item_category_id
df = pd.merge(training_data, shops_data, how = 'left', left_on = 'shop_id', right_on = 'shop_id')
df = pd.merge(df, items_data, how = 'left', left_on = 'item_id', right_on = 'item_id')
df = pd.merge(df, item_categories, how = 'left', left_on = 'item_category_id', right_on = 'item_category_id')
df.drop(['item_id', 'item_category_id', 'shop_id'], axis = 1, inplace = True)
df.head()
| date | date_block_num | item_price | item_cnt_day | shop_name | item_name | item_category_name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-01-02 | 0 | 999.00 | 1.0 | Ярославль ТЦ "Альтаир" | ЯВЛЕНИЕ 2012 (BD) | Кино - Blu-Ray |
| 1 | 2013-01-03 | 0 | 899.00 | 1.0 | Москва ТРК "Атриум" | DEEP PURPLE The House Of Blue Light LP | Музыка - Винил |
| 2 | 2013-01-05 | 0 | 899.00 | -1.0 | Москва ТРК "Атриум" | DEEP PURPLE The House Of Blue Light LP | Музыка - Винил |
| 3 | 2013-01-06 | 0 | 1709.05 | 1.0 | Москва ТРК "Атриум" | DEEP PURPLE Who Do You Think We Are LP | Музыка - Винил |
| 4 | 2013-01-15 | 0 | 1099.00 | 1.0 | Москва ТРК "Атриум" | DEEP PURPLE 30 Very Best Of 2CD (Фирм.) | Музыка - CD фирменного производства |
# The information of the dataframe created by merging the datasets
# The datatypes of the attributes are shown specifically
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2935849 entries, 0 to 2935848 Data columns (total 7 columns): # Column Dtype --- ------ ----- 0 date datetime64[ns] 1 date_block_num int64 2 item_price float64 3 item_cnt_day float64 4 shop_name object 5 item_name object 6 item_category_name object dtypes: datetime64[ns](1), float64(2), int64(1), object(3) memory usage: 179.2+ MB
# Size of merged dataset before removing duplicates
df.shape
(2935849, 7)
# Check for duplicates
df.duplicated().sum()
6
# Remove the duplicates
df.drop_duplicates(keep = 'first', inplace = True)
# Size of merged dataset after removing duplicates
df.shape
(2935843, 7)
# Basic statistical information about the data (mean, sd, min, max)
df.describe()
| date_block_num | item_price | item_cnt_day | |
|---|---|---|---|
| count | 2.935843e+06 | 2.935843e+06 | 2.935843e+06 |
| mean | 1.456991e+01 | 8.908535e+02 | 1.242641e+00 |
| std | 9.422992e+00 | 1.729801e+03 | 2.618837e+00 |
| min | 0.000000e+00 | -1.000000e+00 | -2.200000e+01 |
| 25% | 7.000000e+00 | 2.490000e+02 | 1.000000e+00 |
| 50% | 1.400000e+01 | 3.990000e+02 | 1.000000e+00 |
| 75% | 2.300000e+01 | 9.990000e+02 | 1.000000e+00 |
| max | 3.300000e+01 | 3.079800e+05 | 2.169000e+03 |
# Explore the distbution sales data
data = training_data[['shop_id','item_price', 'item_cnt_day']]
def num_plot(df, var):
fig = plt.figure(figsize = (16, 5))
plt.subplot(1,3,1)
df[var].hist(bins = 40)
plt.title(f"Distribution of {var}")
plt.subplot(1,3,2)
stats.probplot(df[var], dist = "norm", plot = plt)
plt.ylabel('Quantiles')
plt.subplot(1,3,3)
sns.boxplot(y = df[var],color="red")
plt.title("Boxplot")
plt.show()
for var in data:
num_plot(data, var)
# Make a plot to show the frequence of sales among shops
plt.figure(figsize = (20, 20))
sns.countplot(y=df['shop_name'], palette = ("Greens"))
plt.xlabel('Number of sold items in each shop')
plt.ylabel('Name of the shop')
Text(0, 0.5, 'Name of the shop')
# Show shop daily sales
shop_daily_sales = training_data.groupby(['shop_id', 'date_block_num'], as_index = False)['item_cnt_day'].sum()
shop_daily_sales_sc = []
for shop in shop_daily_sales['shop_id'].unique():
current_shop_daily_sales = shop_daily_sales[(shop_daily_sales['shop_id']) == shop]
shop_daily_sales_sc.append(go.Scatter(x=current_shop_daily_sales['date_block_num'], y = current_shop_daily_sales['item_cnt_day'], name=('shop_id %s' % shop)))
layout = go.Layout(title='Shop daily sales', xaxis=dict(title='Time (2013 - 2015)'), yaxis=dict(title='Sales'))
fig = go.Figure(data = shop_daily_sales_sc, layout=layout)
iplot(fig)
# Output the first 10 best selling items
best_selling_items = df['item_name'].value_counts().head(10)
best_selling_items = best_selling_items.to_frame().reset_index()
best_selling_items.columns = ['item_name', 'item_count']
best_selling_items.head(10)
| item_name | item_count | |
|---|---|---|
| 0 | Фирменный пакет майка 1С Интерес белый (34*42)... | 31340 |
| 1 | Playstation Store пополнение бумажника: Карта ... | 9408 |
| 2 | Прием денежных средств для 1С-Онлайн | 9067 |
| 3 | Diablo III [PC, Jewel, русская версия] | 7479 |
| 4 | Kaspersky Internet Security Multi-Device Russi... | 6853 |
| 5 | World of Warcraft. Карта оплаты игрового време... | 6602 |
| 6 | Grand Theft Auto V [PS3, русские субтитры] | 6475 |
| 7 | Call of Duty: Black Ops II [PС, Jewel, русская... | 6320 |
| 8 | Minecraft [Xbox 360, английская версия] | 5811 |
| 9 | Grand Theft Auto V [Xbox 360, русские субтитры] | 5805 |
# Plotting a graph
fig = plt.figure(figsize = (20, 10))
sns.barplot(x=best_selling_items['item_count'], y=best_selling_items['item_name'], data=best_selling_items)
plt.xlabel('Item Count')
plt.ylabel('Item Name')
Text(0, 0.5, 'Item Name')
# Output the first 10 best selling categories
best_selling_categories = df['item_category_name'].value_counts().head(10)
best_selling_categories = best_selling_categories.to_frame().reset_index()
best_selling_categories.columns = ['item_category_name', 'count']
best_selling_categories.head(10)
| item_category_name | count | |
|---|---|---|
| 0 | Кино - DVD | 564651 |
| 1 | Игры PC - Стандартные издания | 351591 |
| 2 | Музыка - CD локального производства | 339585 |
| 3 | Игры - PS3 | 208219 |
| 4 | Кино - Blu-Ray | 192673 |
| 5 | Игры - XBOX 360 | 146785 |
| 6 | Игры PC - Дополнительные издания | 121539 |
| 7 | Игры - PS4 | 79058 |
| 8 | Подарки - Мягкие игрушки | 53845 |
| 9 | Подарки - Настольные игры (компактные) | 53227 |
# Plotting a graph
fig = plt.figure(figsize = (20, 10))
sns.barplot(x = best_selling_categories['count'], y = best_selling_categories['item_category_name'], data = best_selling_categories)
plt.xlabel('Category Count')
plt.ylabel('Item Name')
Text(0, 0.5, 'Item Name')
# Making a word cloud for shop name
plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud1 = WordCloud(background_color = 'pink',
max_words = 200,
stopwords = stopwords,
width = 1000,
height = 1000,
random_state = 42).generate(str(shops_data['shop_name']))
plt.title('Wordcloud for Shop Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud1, interpolation = 'bilinear')
<matplotlib.image.AxesImage at 0x23992160760>
# Making a word cloud for item name
plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud2 = WordCloud(background_color = 'lightgreen',
max_words = 200,
stopwords = stopwords,
width = 1000,
height = 1000,
random_state = 42).generate(str(items_data['item_name']))
plt.title('Wordcloud for Item Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud2, interpolation = 'bilinear')
<matplotlib.image.AxesImage at 0x239a35eb9a0>
# Making a word cloud for item categories name
plt.rcParams['figure.figsize'] = (5, 5)
stopwords = set(STOPWORDS)
wordcloud3 = WordCloud(background_color = 'lightblue',
max_words = 200,
stopwords = stopwords,
width = 1000,
height = 1000,
random_state = 42).generate(str(item_categories['item_category_name']))
plt.title('Wordcloud for Item Category Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud3, interpolation = 'bilinear')
<matplotlib.image.AxesImage at 0x239924c02b0>
# Calculate IQR
Q1 = np.percentile(df.item_price,25)/100
Q3 = np.percentile(df.item_price,75)/100
IQR = Q3-Q1
# Drop item_price 1.5 times IQR above the below the max and min item_price
# Drop item_cnt_day with standard deviation higher than 3
training_data.drop(training_data[training_data.item_price > 1.5*IQR + training_data.item_price.max()].index, inplace = True)
training_data.drop(training_data[training_data.item_price < 1.5*IQR + training_data.item_price.min()].index, inplace = True)
training_data.drop(training_data[training_data.item_cnt_day > 3 * training_data.item_cnt_day.std()].index, inplace = True)
# There are a few negative item_counts
# Thus we need to eliminate negative values
training_data.drop(training_data[training_data.item_price < 0].index, inplace = True)
training_data.drop(training_data[training_data.item_cnt_day < 0].index, inplace = True)
# Calculate revenue
training_data['revenue'] = training_data.item_price * training_data.item_cnt_day
# Group data based on shop id and item id then get value of sales per month
pivot_training_data = training_data.pivot_table(index=['shop_id', 'item_id'], columns='date_block_num', values='item_cnt_day', aggfunc='sum').fillna(0.0)
pivot_training_data.head(20)
| date_block_num | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| shop_id | item_id | |||||||||||||||||||||
| 0 | 30 | 0.0 | 22.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 31 | 0.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 32 | 6.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 33 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 35 | 1.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 36 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 40 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 42 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 43 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 49 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 51 | 2.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 57 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 59 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 61 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 75 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 85 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 88 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 95 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 96 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 97 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
20 rows × 34 columns
# Size of data after removing outliers
print("Size of Training Dataset after removing outliers: ",training_data.shape)
# Size of data after creating pivot table
print("Size of Pivot Dataset: ",pivot_training_data.shape)
Size of Training Dataset after removing outliers: (2874701, 7) Size of Pivot Dataset: (423533, 34)
# Data Cleaning
training_df = pivot_training_data.reset_index()
training_df["shop_id"] = training_df.shop_id.astype("str")
training_df["item_id"] = training_df.item_id.astype("str")
# Merge the items with categories
item_cat_df = items_data.merge(item_categories[['item_category_id','item_category_name']], how="inner", on="item_category_id")[['item_id','item_category_name']]
item_cat_df['item_id'] = item_cat_df.item_id.astype('str')
# Merge the train dataframe with the items and categories dataframe
training_df = training_df.merge(item_cat_df, how="inner", on="item_id")
# Perform Label encoding
number = preprocessing.LabelEncoder()
# Check the variables in the train dataframe
training_df
| shop_id | item_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | ... | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | item_category_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 30 | 0.0 | 22.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - DVD |
| 1 | 1 | 30 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - DVD |
| 2 | 2 | 30 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - DVD |
| 3 | 3 | 30 | 0.0 | 4.0 | 5.0 | 2.0 | 2.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | Кино - DVD |
| 4 | 4 | 30 | 0.0 | 7.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - DVD |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 423528 | 59 | 9760 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - Blu-Ray |
| 423529 | 59 | 12846 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Кино - Blu-Ray |
| 423530 | 59 | 16270 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Программы - Для дома и офиса |
| 423531 | 59 | 17732 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Игры PC - Дополнительные издания |
| 423532 | 59 | 18018 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Музыка - MP3 |
423533 rows × 37 columns
# Encode the column with categorical data
training_df['item_category_name'] = number.fit_transform(training_df['item_category_name'])
# Rearrange the columns
training_df = training_df[['shop_id', 'item_id', 'item_category_name'] + list(range(34))]
training_df.head()
| shop_id | item_id | item_category_name | 0 | 1 | 2 | 3 | 4 | 5 | 6 | ... | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 30 | 40 | 0.0 | 22.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1 | 30 | 40 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | 30 | 40 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3 | 30 | 40 | 0.0 | 4.0 | 5.0 | 2.0 | 2.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 4 | 4 | 30 | 40 | 0.0 | 7.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 37 columns
# Modelling
param = {'max_depth': 13,
'subsample':1,
'min_child_weight': 0.7,
'eta':0.3,
'num_round':1000,
'seed':1,
'silent':0,
'eval_metric':'rmse',
'n_estimators': 120}
# Generate a DMatrix to train the model
xgbtrain = xgb.DMatrix(training_df.iloc[:, (training_df.columns != 33)].values, training_df.iloc[:, training_df.columns == 33].values)
watchlist = [(xgbtrain,'train-rmse')]
# Train the model
bst = xgb.train(param, xgbtrain)
preds = bst.predict(xgb.DMatrix(training_df.iloc[:, (training_df.columns != 33)].values))
rmse = np.sqrt(mean_squared_error(preds,training_df.iloc[:, training_df.columns == 33].values))
# Print the RMSE of the model
print("RMSE Value: ",rmse)
[16:56:06] WARNING: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/learner.cc:767:
Parameters: { "n_estimators", "num_round", "silent" } are not used.
RMSE Value: 0.4302462208714635
# See the the most important features
fig, ax = plt.subplots(figsize=(20, 10))
xgb.plot_importance(bst, ax = ax, importance_type='gain')
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# Preparing the test data frame
test_df = test
test_df.shop_id = test_df.shop_id.astype('str')
test_df.item_id = test_df.item_id.astype('str')
test_df = test.merge(training_df, how="left", on=["shop_id", "item_id"]).fillna(0.0)
# Check the first five variables of the test dataframe
test_df.head()
| ID | shop_id | item_id | item_category_name | 0 | 1 | 2 | 3 | 4 | 5 | ... | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5 | 5037 | 19.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 0.0 |
| 1 | 1 | 5 | 5320 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | 5 | 5233 | 19.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2.0 | 0.0 | 1.0 | 3.0 | 1.0 |
| 3 | 3 | 5 | 5232 | 23.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 4 | 5 | 5268 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 38 columns
d = dict(zip(test_df.columns[4:],list(np.array(list(test_df.columns[4:])) - 1)))
test_df = test_df.rename(d, axis = 1)
# Make sales predictions for the test data
predictions = bst.predict(xgb.DMatrix(test_df.iloc[:, (test_df.columns != 'ID') & (test_df.columns != -1)].values))
predictions = list(map(lambda x: min(20, max(x,0)), list(predictions)))
submission = pd.DataFrame({'ID': test_df.ID, 'item_cnt_month':predictions})
submission['item_cnt_month'] = round(submission['item_cnt_month'] , 2)
submission.to_csv('submission1.csv', index=False)
print(submission)
ID item_cnt_month 0 0 0.46 1 1 0.15 2 2 0.32 3 3 0.13 4 4 0.15 ... ... ... 214195 214195 0.14 214196 214196 0.13 214197 214197 0.05 214198 214198 0.13 214199 214199 0.05 [214200 rows x 2 columns]
# Make a dataset with only monthly sales data
data = training_data.groupby([training_data['date'].apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).sum().reset_index()
# Specify the important attributes which we want to add to the data
data = data[['date','item_id','shop_id','item_cnt_day']]
# Group data based on shop id and item id then get value of sales per month
data = data.pivot_table(index=['item_id','shop_id'], columns = 'date', values = 'item_cnt_day', fill_value = 0).reset_index()
data.shape
(423533, 36)
# Merge the monthly sales data prepared to the test data set
test = pd.read_csv("test.csv")
test = pd.merge(test, data, on = ['item_id', 'shop_id'], how = 'left')
# Filling the empty values found in the dataset
test.fillna(0, inplace = True)
test.head()
| ID | shop_id | item_id | 2013-01 | 2013-02 | 2013-03 | 2013-04 | 2013-05 | 2013-06 | 2013-07 | ... | 2015-01 | 2015-02 | 2015-03 | 2015-04 | 2015-05 | 2015-06 | 2015-07 | 2015-08 | 2015-09 | 2015-10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5 | 5037 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 0.0 |
| 1 | 1 | 5 | 5320 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | 5 | 5233 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2.0 | 0.0 | 1.0 | 3.0 | 1.0 |
| 3 | 3 | 5 | 5232 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 4 | 5 | 5268 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 37 columns
# Create the actual training data
x_train = test.drop(['2015-10', 'item_id', 'shop_id'], axis = 1)
y_train = test['2015-10']
# Deleting the first column so that it can predict the future sales data
x_test = test.drop(['2013-01', 'item_id', 'shop_id'], axis = 1)
print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
Shape of x_train : (214200, 34) Shape of x_test : (214200, 34) Shape of y_train : (214200,)
# Splitting the data into train and valid dataset
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)
print("Shape of x_train :", x_train.shape)
print("Shape of x_valid :", x_valid.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_valid :", y_valid.shape)
Shape of x_train : (171360, 34) Shape of x_valid : (42840, 34) Shape of y_train : (171360,) Shape of y_valid : (42840,)
# MODELING
model_lgb = LGBMRegressor( n_estimators=200,
learning_rate=0.03,
num_leaves=32,
colsample_bytree=0.9497036,
subsample=0.8715623,
max_depth=8,
reg_alpha=0.04,
reg_lambda=0.073,
min_split_gain=0.0222415,
min_child_weight=40)
model_lgb.fit(x_train, y_train)
y_pred_lgb = model_lgb.predict(x_test)
# Generate the model
y_pred_lgb = model_lgb.predict(x_test).clip(0., 20.)
# Create the submission file and submit
preds = pd.DataFrame(y_pred_lgb, columns=['item_cnt_month'])
preds.to_csv('submission2.csv',index_label='ID')
submission.to_csv('submission2.csv', index=False)
print(submission)
ID item_cnt_month 0 0 0.46 1 1 0.15 2 2 0.32 3 3 0.13 4 4 0.15 ... ... ... 214195 214195 0.14 214196 214196 0.13 214197 214197 0.05 214198 214198 0.13 214199 214199 0.05 [214200 rows x 2 columns]
Prediction Model 2 has better outcome. The submission file had item counts with decimal value greater than 5 which was then rounded to 2digits after decimal. The rsme value of the first model dropped from 0.8 to 0.43 by changing the max_depth parameters appropriately.